#!/bin/sh
#
# Trying to track down assorted transient QA environment issues.
#
# Run from check after each test completes, but you need to ensure
# the script is called check.callback and it is executable.
#
# Usage: check.callback [--precheck] [seq#]
#
# Note: This is a SAMPLE script, you should either
#	(a) $ ln check.callback.sample check.callback
#           or probably better to ensure tracking any upstream changes
#           $ ln -s check.callback.sample check.callback
#	    to have it activated "as is", or
#	(b) $ cp check.callback.sample check.callback
#	    and then edit check.callback to suit local needs ... see
#	    in particular the CONFIGURE-ME comments below
#

# A non-empty $PCP_LOG_DIR means the PCP environment is already in place.
# Otherwise (running standalone, not called from check) it has to be
# loaded from $PCP_DIR/etc/pcp.env, same logic as in common.rc.
#
if [ -n "$PCP_LOG_DIR" ]
then
    # environment already populated, nothing to do
    :
elif [ -r $PCP_DIR/etc/pcp.env ]
then
    # source the PCP configuration environment variables
    . $PCP_DIR/etc/pcp.env
else
    echo "Error: unable to read $PCP_DIR/etc/pcp.env!" >&2
    exit 1
fi

# Security Enhanced Linux log messages
audit=/var/log/audit/audit.log
# Path to sudo, empty when sudo is not installed (commands prefixed with
# $sudo then run unprivileged).
# command -v is the POSIX way to do the lookup; which(1) is not
# guaranteed to exist and some versions report "not found" on stdout.
sudo=$(command -v sudo)

# Did pmlogger_daily get run as expected?
# Nothing is checked while the time of day matches 00:[012]*, i.e. just
# after midnight.
#
now=$(pmdate %H:%M)
case "$now"
in
    00:[012]*)
	;;
    *)
	yesterday=$(pmdate -1d %Y%m%d)
	host=$(hostname)
	case "$host"
	in
	    # CONFIGURE-ME
	    # - the test is only enabled for the hosts named here, so
	    #   extra hosts may need to be added
	    #
	    bozo|bozo-vm|vm01|vm03|vm36)
		archdir=$PCP_LOG_DIR/pmlogger/$host
		if [ ! -f $archdir/$yesterday.meta.xz ] && [ ! -f $archdir/$yesterday.meta ]
		then
		    # No archive for $yesterday, compressed or not.
		    # That is only a problem when component archives for
		    # $yesterday are lying around ... after a virgin
		    # install, cleaning out the directory or a hostname
		    # change there are NO $yesterday files at all, and
		    # then we stay quiet.
		    #
		    nfile=$(find  $archdir -type f | grep -c "^$archdir/$yesterday.*\.meta")
		    if [ "$nfile" -gt 0 ]
		    then
			echo "check.callback: fail: pmlogger_daily not well!"
			echo "Missing $yesterday archive but $nfile component archives ..."
			ls -l $archdir/$yesterday.*
		    fi
		fi
		;;
	esac
	;;
esac

# status is the exit status of this script and tmp is the per-process
# prefix for all scratch files.
# On exit, and on SIGHUP, SIGINT, SIGQUIT or SIGTERM, the scratch files
# are removed and the script exits with whatever $status is at that time.
status=0
tmp=/var/tmp/check.callback-$$
trap 'rm -f $tmp.*; exit $status' EXIT HUP INT QUIT TERM

# Pre-check mode, "check.callback --precheck seq#":
# run the pmcd (./941) and pmlogger (./870) checks with their output
# going straight to stdout/stderr, save the PCP-related AVC/SELINUX
# lines currently in the audit log as <seq#>.pre-avc and exit without
# running any of the checks further down.
#
if [ "$1" = "--precheck" ]
then
    shift
    echo "--- start pre-check ---"
    ./941 --check $1
    ./870 --check $1
    # grep -E, not egrep: egrep is obsolescent and newer GNU grep
    # versions warn every time it is used
    $sudo grep -E '^type=(AVC|SELINUX).*pcp' $audit >$1.pre-avc 2>/dev/null
    echo "before $(wc -l <$1.pre-avc) AVC errors"
    echo "--- end pre-check ---"
    exit
fi

# CONFIGURE-ME
# abort selects what a failed check does, apart from being reported:
#   true   status is set to 1, for when QA should stop as soon as a
#          problem is seen ... good for hard problems, because you'd
#          like to know which QA test is breaking things and there is
#          no point continuing
#   false  status is left alone and QA continues on ... good for
#          transient problems, because the breakage may be repaired
#          autonomously (usually timing related) or by a later QA test
#
abort=false

# Check pmcd status
#
if ! ./941 --check $1 >$tmp.out 2>$tmp.err
then
    echo "check.callback: fail: pmcd not well!"
    cat $tmp.err $tmp.out
    if $abort
    then
	status=1
    fi
fi

# Check pmlogger status
#
# A failure may be a transient thing, e.g. cron-driven log rotation, so
# ./870 is tried a second time 3 seconds later and only two failures in
# a row are reported.
#
if ! ./870 --check $1 >$tmp.out.1 2>$tmp.err.1
then
    sleep 3

    if ! ./870 --check $1 >$tmp.out.2 2>$tmp.err.2
    then
	# the first failure is always shown
	#
	cat $tmp.err.1 $tmp.out.1

	# the second failure is shown as well, unless it is the same as
	# the first one: stderr has to be identical, and so does stdout
	# once the "Now:" timestamp lines from ./870 are stripped
	#
	changed=true
	if cmp $tmp.err.1 $tmp.err.2 >/dev/null
	then
	    sed -e '/^Now:/d' <$tmp.out.1 >$tmp.tmp.1
	    sed -e '/^Now:/d' <$tmp.out.2 >$tmp.tmp.2
	    if cmp $tmp.tmp.1 $tmp.tmp.2 >/dev/null
	    then
		changed=false
	    fi
	fi
	if $changed
	then
	    echo "... and different output from 3 seconds later"
	    cat $tmp.err.2 $tmp.out.2
	fi
	echo "check.callback: fail: pmlogger not well!"
	if $abort
	then
	    status=1
	fi
    fi
fi

# Check are all pmdas alive and well
#
# _check_pmdas fetches pmcd.agent.status with pminfo and reports
# - every PMDA whose status value is not 0
# - any difference between the PMDA names seen now and the names
#   recorded by the previous run of this check
# - pminfo itself failing
# It uses $tmp for scratch files and, when $abort is true, sets status=1
# for each problem found.
#
# CONFIGURE-ME
# - the list of PMDA names has to outlive this process to be of any use
#   to the next run, so it cannot be one of the $tmp.* files (they are
#   per-pid and removed on exit) ... set pmdalist_prev in the environment
#   to keep it somewhere else
#
pmdalist_prev=${pmdalist_prev:-/var/tmp/check.callback.pmdalist.prev}

_check_pmdas()
{
    if pminfo -f pmcd.agent.status >$tmp.out 2>$tmp.err
    then
	# reduce each instance line to "domain name status"
	#
	sed -n <$tmp.out >$tmp.agents \
	    -e '/^  *inst /{
s/^  *inst \[//
s/ or / /
s/"//g
s/] value / /
p
}'
	# start from an empty list, so the file exists even if pminfo
	# reported no instances
	#
	: >$tmp.pmdalist
	# The loop reads from a file and not from a pipe so that it runs
	# in this shell and setting status sticks; the last field is not
	# called status so the script's exit status is not clobbered.
	#
	while read -r domain name pmda_status
	do
	    if [ "$pmda_status" != 0 ]
	    then
		echo "check.callback: fail: pmda $name not well, status=$pmda_status!"
		$abort && status=1
	    fi
	    echo "$name" >>$tmp.pmdalist
	done <$tmp.agents
	if [ -f "$pmdalist_prev" ]
	then
	    if diff -u "$pmdalist_prev" $tmp.pmdalist >$tmp.out
	    then
		:
	    else
		echo "check.callback: fail: installed pmdas changed!"
		cat $tmp.out
		$abort && status=1
	    fi
	fi
	cp $tmp.pmdalist "$pmdalist_prev"
    else
	echo "check.callback: fail: pminfo not well!"
	cat $tmp.err $tmp.out
	$abort && status=1
    fi
}

_check_pmdas

# Check the PMNS ... the case of interest is a PMDA that was
# Install(ed), followed by pmcd.conf being restored without a Remove,
# which leaves the PMDA out of pmcd.conf but its (now bogus) entries
# still in the PMNS
#
# CONFIGURE-ME
# - metrics that are allowed to return 'Unknown or illegal metric' are
#   dropped by the /.../d lines of the sed script; adding more there is
#   possible, but most unlikely to be needed
#
pminfo -v -b 5000 2>&1 \
| sed -n >$tmp.out \
    -e '/Unknown or illegal metric/!d' \
    -e '/^sample.*\.bad\.unknown:/d' \
    -e '/^pmproxy\.pid:/d' \
    -e p
if [ -s $tmp.out ]
then
    echo "check.callback: fail: PMNS not well!"
    cat $tmp.out
    if $abort
    then
	status=1
    fi
fi

# More pmcd/pmda/pmns checks and check for config files not returned
# to their pre-QA state ... these are done by ./1190 --check and any
# output it writes to stdout is taken to be a problem report
#
./1190 --check $1 >$tmp.out
if [ ! -s $tmp.out ]
then
    # no output, nothing to report
    :
else
    echo "check.callback: fail: see below!"
    cat $tmp.out
    if $abort
    then
	status=1
    fi
fi

# Check audit log for any Security Enhanced Linux access denials
# related to PCP ...
#
# The PCP-related AVC/SELINUX lines now in the audit log are saved as
# <seq#>.post-avc.  If <seq#>.pre-avc exists (it is written by
# "check.callback --precheck <seq#>") the lines that are only in the
# post-avc file are reported as new denials.  Both files are removed
# afterwards.
#
# grep -E, not egrep: egrep is obsolescent and newer GNU grep versions
# warn every time it is used
$sudo grep -E '^type=(AVC|SELINUX).*pcp' $audit 2>/dev/null >$1.post-avc
if [ -f $1.pre-avc ]
then
    # Only diff's "> " lines are new; the pattern is anchored at the
    # start of the line so that a "< " line with "> " somewhere in the
    # audit text is not picked up (and mangled) by mistake.
    diff $1.pre-avc $1.post-avc \
    | sed -n -e 's/^> //p' >$tmp.out
    if [ -s $tmp.out ]
    then
	echo "check.callback: fail: new SELinux/AVC denials"
	cat $tmp.out
	echo "after $(wc -l <$1.post-avc) AVC errors"
	$abort && status=1
    fi
fi
rm -f $1.pre-avc $1.post-avc

# the trap on exit set up earlier in this script makes $status the final
# exit status
exit
